load libs

library(DBI)
library(broom)
library(splines)
library(stringr)
library(dplyr)
library(ggplot2)

Load exploration

source("../6_nhanes_data/phesant.R")

exposure_vars <-
  read.delim("../data/select_vars.txt", header = FALSE)$V1
exposure_cols <- paste(exposure_vars, collapse = ", ")


# Load data and convert data types according to PHESEANT.
# Return data set and PHESEANT results.
load_data <- function(exposure_cols) {
  nhanes_db <- dbConnect(RSQLite::SQLite(), "../nhanes.sqlite")
  
  cols <-
    'BMXWAIST, RIDAGEYR, BMXHT, BMXBMI, BMXWT, RIAGENDR,SDMVPSU,SDMVSTRA,RIDRETH1, years,'
  data_sql <-
    paste('SELECT', cols, exposure_cols, 'FROM merged_table')
  
  data <- dbGetQuery(nhanes_db, data_sql)
  data <- na.omit(data)
  dbDisconnect(nhanes_db)
  
  phs_res <- phesant(data)
  
  data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])] <-
    lapply(data[, names(phs_res[phs_res %in% c("ordered", "unordered", "binary")])], as.factor)
  
  data[, names(phs_res[phs_res == "continuous"])] <-
    lapply(data[, names(phs_res[phs_res == "continuous"])], as.numeric)
  
  return(list(data = data, phs_res = phs_res))
  
}

data_phs <- load_data(exposure_cols)

data <- data_phs$data
phs_res <- data_phs$phs_res

Inverse Normal Distribution and build formula

invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}
min_max_norm <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}


build_formula <- function(base_model,exposure_var,inv=FALSE) {
  var <- "+ %s"
  if(inv) var <- " + invNorm(%s)" 
  
  as.formula(
    sprintf(paste0(base_model, var),
      exposure_var
    )
  )
}

EnWAS

enwas <-
  function(base_model,
           exposure_vars,
           data_set,
           inv_norm = FALSE) {
    num_var <- length(exposure_vars)
    
    model_list <- vector(mode = "list", length = num_var)
    num_cols <- sapply(data_set[, exposure_vars], is.numeric)
    num_cols <- names(num_cols[num_cols == TRUE])
    if (inv_norm) {
      data_set[, num_cols] <- lapply(data_set[, num_cols], invNorm)
    }
    
    
    
    
    association_list <- data.frame()
    factor_terms <- c() # to hold the factors terms
    num_var <- length(exposure_vars)
    for (i in 1:num_var) {
      exposure <- exposure_vars[i]
      # print(exposure)
      
      if (is.factor(data_set[, exposure])) {
        factor_terms <-
          c(factor_terms, paste0(exposure, levels(data_set[, exposure])))
      }
      
      model <- build_formula(base_model, exposure)
      
      mod <- lm(model, data_set)
      model_list[[i]] <- mod
      mod_df <- tidy(mod)
      association_list <-
        rbind(association_list, mod_df) # step 3b of algorithm
    }
    
    # comments the following code out for ignore the categorical phenotypes for now
    # xwas_list <-
    #   association_list[association_list$term %in% c(exposure_vars, factor_terms), ]
    xwas_list <-
      association_list[association_list$term %in% exposure_vars, ]

    
    # sd_x_list <-  sapply(data_set[,num_cols],sd)
    sd_x_list <-  sapply(data_set, function(x)
      sd(as.numeric(x)))
    for (var in exposure_vars) {
      xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] <-
        xwas_list[grepl(var, xwas_list$term), c('estimate', 'std.error')] * sd_x_list[var]
    }
    
    
    
    xwas_list$fdr <-
      p.adjust(xwas_list$p.value, method = 'BY')
    
    
    
    xwas_list <- xwas_list |>
      mutate(lower = estimate - 1.96 * std.error)  |>
      mutate(upper = estimate + 1.96 * std.error)
    
    return (list(model_list = model_list, enwas_res = xwas_list))
    
  }




# forest_plot <- function(xwas_result) {
#   xwas_result |>
#     ggplot(aes(
#       x = term,
#       y = estimate,
#       ymin = lower,
#       ymax = upper,
#       colour = estimate
#     )) +
#     geom_pointrange() +
#     coord_flip() +  # flip coordinates (puts labels on y axis)
#     xlab("Exposures") + ylab("Estimate (95% CI)") +
#     theme_bw()  # use a white background
# }



forest_plot <- function(xwas_result) {
  xwas_result$col <- as.numeric(rownames(xwas_result)) %% 2
  n <- nrow(xwas_result)
  xwas_result |> mutate(xmin = seq(0.5, n - 0.5, by = 1),
                        xmax = seq(1.5, n + 0.5, by = 1)) |>
    ggplot(aes(x = term,
               y = estimate,
               colour = estimate)) +
    geom_point(size = 2, position = position_dodge(width = 1)) +
    geom_errorbar(
      aes(ymin = lower, ymax = upper),
      position = position_dodge(width = 1),
      width = 0.5,
      cex = 1
    ) +
    geom_hline(yintercept = 0, linetype = 'dashed') +
    geom_rect(
      aes(
        xmin = xmin,
        xmax = xmax,
        ymin = -Inf,
        ymax = +Inf,
        fill = factor(col)
      ),
      color = 'black',
      alpha = 0.3
    ) +
    scale_fill_manual(values = c('white', 'grey78'), guide = 'none') +
    coord_flip() +  # flip coordinates (puts labels on y axis)
    xlab("Exposures") + ylab("Estimate (95% CI)") +
    theme_bw()  # use a white background
}






forest_plot_mult <- function(xwas_result_list) {
  xwas_result <- do.call("rbind", xwas_result_list)
  xwas_result$EnWAS <-
    rep(names(xwas_result_list), each = nrow(xwas_result_list[[1]]))
  
  tem_df <- xwas_result_list[[1]]
  n <- nrow(tem_df)
  tem_df$col <- as.numeric(rownames(tem_df)) %% 2
  tem_df <- tem_df|> mutate(xmin = seq(0.5, n - 0.5, by = 1),
                        xmax = seq(1.5, n + 0.5, by = 1)) 
  xwas_result |>  ggplot(aes(
      x = term,
      y = estimate,
      colour = EnWAS
    ))  +
     geom_point(size = 2, position = position_dodge(width = 1)) + 
     geom_errorbar(aes(ymin = lower, ymax = upper), 
                position = position_dodge(width = 1), width=0.5,cex=1) + 
    geom_hline(yintercept = 0, linetype = 'dashed') +
    geom_rect(data=tem_df,
      aes(
        xmin = xmin,
        xmax = xmax,
        ymin = -Inf,
        ymax = +Inf,
        fill = factor(col)
      ),
      color = 'black',
      alpha = 0.3
    ) +
    scale_fill_manual(values = c('white', 'grey78'), guide = 'none') +
    coord_flip() +  # flip coordinates (puts labels on y axis)
    xlab("Exposures") + ylab("Estimate (95% CI)") +
    theme_bw()  # use a white background
}

Run EnWAS

linear_model <- 'BMXWAIST ~ RIDAGEYR*RIAGENDR + BMXBMI'
linear_res2 <- enwas(linear_model, exposure_vars, data)



all_ns_model <-
  'BMXWAIST ~ ns(RIDAGEYR, knots = seq(20, 80, by = 10)) * RIAGENDR + ns(BMXBMI,knots = c(seq(15, 45, by = 5),seq(45,65,by=10)))'
all_ns_res <- enwas(all_ns_model, exposure_vars, data)


lm_inverse_res <- enwas(linear_model, exposure_vars, data,inv = TRUE)
ns_inverse_res <- enwas(all_ns_model, exposure_vars, data,inv = TRUE)



forest_plot(linear_res2$enwas_res)

forest_plot_mult(
  list(
    linear = linear_res2$enwas_res,
    ns = all_ns_res$enwas_res
  )
)

forest_plot_mult(
  list(
    lm_inv = lm_inverse_res$enwas_res,
    ns_inv = ns_inverse_res$enwas_res
  )
)

forest_plot_mult(
  list(
    linear = linear_res2$enwas_res,
    lm_inv = lm_inverse_res$enwas_res
  )
)

forest_plot_mult(
  list(
    ns = all_ns_res$enwas_res,
    ns_inv = ns_inverse_res$enwas_res
  )
)

EnWAS show interesting variables

xwas_res_list <-
  list(
    linear = lm_inverse_res$enwas_res,
    ns = all_ns_res$enwas_res,
    lm_inv = lm_inverse_res$enwas_res,
    ns_inv = ns_inverse_res$enwas_res
  )
xwas_result <- do.call("rbind", xwas_res_list)
xwas_result$EnWAS <-
  rep(names(xwas_res_list), each = nrow(xwas_res_list[[1]]))

xwas_result$postive <- xwas_result$lower * xwas_result$upper
xwas_result$postive <- xwas_result$postive >= 0.0001

# show the false positive raised in linear base model
num_cols <- sapply(data[, exposure_vars], is.numeric)
num_cols <- names(num_cols[num_cols == TRUE])
ns_vs_linear <-
  xwas_result[xwas_result$EnWAS == "linear", ]$postive != xwas_result[xwas_result$EnWAS ==
                                                                       "ns", ]$postive
num_cols[ns_vs_linear]

[1] “DR1TPROT” “DR1TFIBE” “DR1TCRYP” “DR1TNIAC” “DR1TSELE”

for (c in num_cols[ns_vs_linear]) {
  g <-
    ggplot(data,
           aes_string(
             x = "RIDAGEYR",
             y = paste0("invNorm(", c, ")"),
             colour = "RIAGENDR"
           )) +
    geom_point(alpha = 0.1) +
    geom_smooth(
      method = lm,
      formula = y ~ ns(x, knots = seq(30, 80, by = 10)),
      size = 1
    )
  print(g)
}

# ns vs inverse norm transformation
ns_vs_ns_inv <-
  xwas_result[xwas_result$EnWAS == "ns_inv", ]$postive != xwas_result[xwas_result$EnWAS ==
                                                                        "ns", ]$postive
num_cols[ns_vs_ns_inv]

[1] “DR1TPROT” “DR1TCRYP” “DR1TNIAC” “DR1TVK” “DR1TSELE”

for (c in num_cols[ns_vs_ns_inv]) {
  g <-
    ggplot(data,
           aes_string(
             x = "RIDAGEYR",
             y = paste0("invNorm(", c, ")"),
             colour = "RIAGENDR"
           )) +
    geom_point(alpha = 0.1) +
    geom_smooth(
      method = lm,
      formula = y ~ ns(x, knots = seq(20, 80, by = 10)),
      size = 1
    )
  print(g)
  par(mfrow = c(1, 2))
  hist(data[, c], main = c)
  hist(invNorm(data[, c]), main = paste0("invNorm:", c))
}
## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(model, newdata = new_data_frame(list(x = xseq)), se.fit =
## se, : prediction from a rank-deficient fit may be misleading

Testing Enthnic Groups Linear Model

enthics <- c('Mexican American','Other Hispanic','Non-Hispanic White','Non-Hispanic Black','Other Race')
lm_base <- lm(as.formula(linear_model),data)
lm_df <- data_frame(residuals=lm_base$residual,ethnic = data$RIDRETH1,gender=data$RIAGENDR)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
levels(lm_df$ethnic) <- enthics
ggplot(lm_df, aes(x=ethnic, y=residuals, fill=gender)) + geom_boxplot()+
  scale_x_discrete(guide = guide_axis(angle = 45)) 

Testing Enthnic Groups Spline Model

ns_base <- lm(as.formula(all_ns_model),data)
ns_df <- data_frame(residuals=ns_base$residual,ethnic = data$RIDRETH1,gender=data$RIAGENDR)
levels(ns_df$ethnic) <- enthics
ggplot(ns_df, aes(x=ethnic, y=residuals, fill=gender)) + geom_boxplot()+
  scale_x_discrete(guide = guide_axis(angle = 45))